در سوالات ۵ و ۹ نمودارها knit نشدند. لطفا برای دیدن آنها کد را ران کنید.
wdiData <- read.csv("~/Downloads/WDI_csv/WDIData.csv")
wdiCountry <- read.csv("~/Downloads/WDI_csv/WDICountry.csv")
wdiSeries <- read.csv("~/Downloads/WDI_csv/WDISeries.csv")
در این سوال ملاک فقر را درصد جمعیت زیر خط فقر گرفتم و بر اساس بیشترین جمعیتها فقیرترین کشورها را حساب کردم. سپس شاخص امید به زندگی این کشورها را در تمام سالهای موجود در داده میانگین گرفتم. همچنین برای حقوق روزانه شاخص حقوق سالیانه را بر تعداد روزهای سال تقسیم کردم که در نمودار اطلاعات آن امدهاست.
poverty <- wdiData %>% filter(Indicator.Code == "SI.POV.NAHC") %>% select(Country.Name, starts_with("X"))
poverty$PercentageOfPeopleBeyondPovertyLine <- poverty %>% select(starts_with("X")) %>% rowMeans(na.rm = T)
tenpoorest <- poverty %>% arrange(desc(PercentageOfPeopleBeyondPovertyLine)) %>% select(Country.Name,PercentageOfPeopleBeyondPovertyLine)%>% top_n(10)
poorcountrieslifetime <- wdiData %>% filter(Indicator.Code == "SP.DYN.LE00.IN" & Country.Name %in% tenpoorest$Country.Name) %>% select(Country.Name, starts_with("X"))
poorcountrieslifetime$meanOfLifeTime <- poorcountrieslifetime %>% select(starts_with("X")) %>% rowMeans(na.rm = T)
poorcountrieslifetime <- poorcountrieslifetime %>% select(Country.Name,meanOfLifeTime)
poors <- inner_join(tenpoorest,poorcountrieslifetime, by = "Country.Name")
poors
## Country.Name PercentageOfPeopleBeyondPovertyLine
## 1 Equatorial Guinea 76.8000
## 2 Zimbabwe 72.3000
## 3 Madagascar 71.6000
## 4 Eritrea 69.0000
## 5 Sao Tome and Principe 67.2500
## 6 Guinea-Bissau 67.0000
## 7 Congo, Dem. Rep. 66.6000
## 8 Burundi 66.0000
## 9 Swaziland 66.0000
## 10 Honduras 62.4125
## meanOfLifeTime
## 1 47.29447
## 2 54.07304
## 3 52.46847
## 4 50.61509
## 5 60.64623
## 6 47.95014
## 7 48.79504
## 8 48.65196
## 9 51.99816
## 10 62.98398
wdiData %>% filter(Country.Name %in% poors$Country.Name & Indicator.Code == "NY.GNP.PCAP.PP.CD") ->annualsalary
annualsalary$dailysalary <- annualsalary %>% select(starts_with("X")) %>% rowMeans(na.rm = T)
annualsalary$dailysalary <- annualsalary$dailysalary / 365
hchart(annualsalary, "column", hcaes(x = Country.Name, y = dailysalary))
همان طور که از نمودار مشخص است در سال ۱۹۹۴ امید به زندگی در کشور روآندا به ۲۷ سال میرسد که مصادف با نسل کشی در این کشور است و حدود ۱ میلیون نفر کشته شدهاند.
lifeexp <- wdiData %>% filter(Indicator.Code == "SP.DYN.LE00.IN") %>% select(Country.Name, starts_with("X"))
lifeexp <- lifeexp[48:264,] %>% melt
rwanda <- wdiData %>% filter(Country.Code == "RWA" & Indicator.Code == "SP.DYN.LE00.IN") %>% select(starts_with("X")) %>% t() %>% data.frame() %>% setDT(keep.rownames = T)
colnames(rwanda) <- c("year","life")
rwanda <- rwanda[1:57,]
hcboxplot(x = lifeexp$value, var = lifeexp$variable,
name = "Length", color = "#2980b9") %>%
hc_chart(type = "column") %>%
hc_add_series(rwanda, "line", hcaes(x= year, y = life))
نمیتوان نتیجه گیری مطمئنی کرد چون پراکندگی دادهها در ابتدای نمودار زیاد است اما طبق خطی که بر دادهها برازش شدهاست به نظر میرسد میان شاخص هزینههای سلامت و امید به زندگی رابطهی مستقیمی وجود دارد.
search<-WDIsearch(string = "health", field = "name", short = TRUE, cache = NULL)
healthexpendituretotaldollor <- wdiData %>% filter(Indicator.Code == "SH.XPD.CHEX.GD.ZS") %>% select(Country.Name, starts_with("X"))
healthexpendituretotaldollor$mean <- healthexpendituretotaldollor %>% select(starts_with("X")) %>% rowMeans(na.rm = T)
lifeexpectancy <- wdiData %>% filter(Indicator.Code == "SP.DYN.LE00.IN") %>% select(Country.Name, starts_with("X"))
lifeexpectancy$mean <- lifeexpectancy %>% select(starts_with("X")) %>% rowMeans(na.rm = T)
lifeandhealth <- inner_join(lifeexpectancy %>% select(Country.Name,mean), healthexpendituretotaldollor %>% select(Country.Name, mean), by = "Country.Name")
colnames(lifeandhealth) <- c("CountryName","LifeExpectancy", "HealthExpenditure")
ggplot(lifeandhealth, aes(x=HealthExpenditure, y=LifeExpectancy)) +
geom_point(shape=1) +
geom_smooth(method=lm,
se=FALSE)
برای این کار از شاخص CPI استفاده شده است. به این معنی که بالاتر رفتن CPI به معنای کم ارزش تر شدن همان مقدار پول است. در نتیجه در طی این سال ها قدرت خرید مردم ایران همواره کاهش یافته است
householdexpeditureUsdollor <- wdiData %>% filter(Country.Code == "IRN" & Indicator.Code == "FP.CPI.TOTL")
householdexpeditureUsdollor <- householdexpeditureUsdollor[,5:63]
householdexpeditureUsdollor <- t(householdexpeditureUsdollor)
householdexpeditureUsdollor <- data.frame(householdexpeditureUsdollor)
householdexpeditureUsdollor <- setDT(householdexpeditureUsdollor, keep.rownames = TRUE)
colnames(householdexpeditureUsdollor) <- c("Year", "Household")
hchart(householdexpeditureUsdollor, "line", hcaes(x = Year, y = Household))
importantFinancialIndicator <- c("FR.INR.LNDP","BN.CAB.XOKA.CD","BX.GSR.GNFS.CD","NE.CON.GOVT.CD", "NE.CON.PETC.CD", "NE.EXP.GNFS.KD.ZG", "FP.CPI.TOTL.ZG","FR.INR.DPST","FR.INR.LEND","SL.GDP.PCAP.EM.KD", "BM.GSR.GNFS.CD" , "NY.GDP.MKTP.KD.ZG","FI.RES.TOTL.CD","FI.RES.TOTL.DT.ZS","FM.LBL.BMNY.CN","FS.AST.CGOV.GD.ZS", "NY.GDP.MKTP.KD", "NY.ADJ.NNAT.CD", "NY.GDP.MKTP.PP.CD","NY.GDP.FCST.CD")
codesandNames <- wdiSeries %>% filter(Series.Code %in% importantFinancialIndicator) %>% select(Series.Code, Indicator.Name)
for (i in 1:20){
indicator <- wdiData %>% filter(Indicator.Code == importantFinancialIndicator[i]) %>% select(Country.Name, starts_with("X"))
indicator <- indicator[48:264,] %>% melt
iran <- wdiData %>% filter(Country.Code == "IRN" & Indicator.Code == importantFinancialIndicator[i]) %>% select(starts_with("X")) %>% t() %>% data.frame() %>% setDT(keep.rownames = T)
colnames(iran) <- c("year","indicator")
iran <- iran[1:57,]
print(hcboxplot(x = indicator$value, var = indicator$variable,
name = "Length", color = "#2980b9",outliers = F) %>%
hc_chart(type = "column") %>%
hc_add_series(name = "Iran" , iran, type = "line", hcaes(x= year, y = indicator),color = "Red") %>%
hc_yAxis(title =list(text =codesandNames$Indicator.Name[i])))
print(hchart(indicator, "line", hcaes(x = variable, y = value, group = Country.Name)))
}
ایران در دستهی سوم همراه با کشورهای چین، قطر و … قرار دارد.
allfinancialIndicators <- wdiData %>% filter(Indicator.Code %in% importantFinancialIndicator) %>% select(Country.Name,Indicator.Name, starts_with("X"))
allfinancialIndicators <- allfinancialIndicators[941:5280,] %>% melt
allfinancialIndicators <- allfinancialIndicators %>% select(Country.Name, Indicator.Name, variable, value)
colnames(allfinancialIndicators) <- c("Country", "Indicator" ,"Year","Value")
allfinancialIndicators <- reshape(allfinancialIndicators,idvar = c("Country","Year"), timevar = "Indicator", direction = "wide")
allfinancialIndicatorsPerCnt <- allfinancialIndicators %>% group_by(Country) %>% summarise_all(funs(mean(., na.rm = TRUE)))
for(i in 3:22){
allfinancialIndicatorsPerCnt[is.na(allfinancialIndicatorsPerCnt[,i]), i] <- colMeans(allfinancialIndicatorsPerCnt[,3:22],na.rm = TRUE)[i-2]
}
scaled <- scale(allfinancialIndicatorsPerCnt[,3:22])
rownames(scaled) <- allfinancialIndicatorsPerCnt$Country
kcl = kmeans(scaled,centers = 3)
kcl$cluster[91]
## Iran, Islamic Rep.
## 1
fviz_cluster(kcl, data = scaled)
بله چرا که خوشهها در راستای دو مولفهی اصلی تشکیل شدهاند.
library(factoextra)
pca <- prcomp(scaled)
fviz_pca_biplot(pca, habillage=as.factor(kcl$cluster))
همانند روشی که در تمارین قبلی استفاده کردیم برای پیشبینی رشد اقتصادی ایران از مقدار رشد در سالهای قبل استفاده میکنیم. تعداد سالهای قبل را عددی بین ۱ و ۱۰ در نظر میگیریم و سپس با برازش خط و مقایسهی خطاها به نظر میرسد بهترین تعداد سال ۲ است. یعنی در صورتی که مقدار رشد در هر سال را با استفاده از دادهی دو سال قبل به دست آوریم بهترین نتیجه را خواهیم گرفت.
gdpgrowth <- wdiData %>% filter(Country.Code == "IRN", Indicator.Code == "NY.GDP.MKTP.KD.ZG") %>% select(starts_with("X"))
gdpgrowth <- data.frame(t(gdpgrowth))
colnames(gdpgrowth) <- c("GDPgrowth")
gdpgrowth[is.na(gdpgrowth)] <- mean(gdpgrowth$GDPgrowth, na.rm = T)
gdpgrowth$n <- seq.int(nrow(gdpgrowth))
for (k in (1:10)){
df <- data.frame()
if (k != 1){
for (i in 1:(nrow(gdpgrowth)-k)){
df <- rbind(df, gdpgrowth$GDPgrowth[i:(i+k)])
}
print(k)
print(mean(summary(lm(df[,ncol(df)]~., data = df))$residuals^2))
}
if(k == 1){
df <- gdpgrowth$GDPgrowth[1:nrow(gdpgrowth)-1]
df <- cbind(df, gdpgrowth$GDPgrowth[2:nrow(gdpgrowth)])
df <- data.frame(df)
print(mean(summary(lm(V2~., data = df))$residuals^2))
}
}
## [1] 53.24525
## [1] 2
## [1] 2.731712e-31
## [1] 3
## [1] 1.333459e-30
## [1] 4
## [1] 1.711332e-30
## [1] 5
## [1] 1.660426e-29
## [1] 6
## [1] 1.227554e-29
## [1] 7
## [1] 2.474536e-29
## [1] 8
## [1] 1.783535e-29
## [1] 9
## [1] 7.847749e-30
## [1] 10
## [1] 4.775992e-31
df <- data.frame()
for (i in 1:(nrow(gdpgrowth)-2)){
df <- rbind(df, gdpgrowth$GDPgrowth[i:(i+2)])
}
importantEducationalIndicator <- c("SE.ADT.1524.LT.FE.ZS","SE.ADT.LITR.ZS","SE.ADT.1524.LT.MA.ZS","SE.COM.DURS", "SE.PRE.ENRL.TC.ZS", "SE.PRE.ENRR", "SE.PRM.AGES","SE.PRM.CMPT.ZS","SE.PRM.OENR.ZS", "SE.PRM.PRSL.ZS" , "SE.PRM.REPT.ZS","SE.PRM.TCAQ.ZS","SE.PRM.UNER","SE.PRM.UNER.FE.ZS","SE.PRM.UNER.MA.ZS", "SE.SEC.PROG.ZS", "SE.SEC.UNER.LO.ZS","SE.XPD.TOTL.GB.ZS","SE.XPD.TOTL.GD.ZS", "SE.XPD.PRIM.ZS")
codesandNamesedu <- wdiSeries %>% filter(Series.Code %in% importantEducationalIndicator) %>% select(Series.Code, Indicator.Name)
for (i in 1:20){
indicator <- wdiData %>% filter(Indicator.Code == importantEducationalIndicator[i]) %>% select(Country.Name, starts_with("X"))
indicator <- indicator[48:264,] %>% melt
iran <- wdiData %>% filter(Country.Code == "IRN" & Indicator.Code == importantEducationalIndicator[i]) %>% select(starts_with("X")) %>% t() %>% data.frame() %>% setDT(keep.rownames = T)
colnames(iran) <- c("year","indicator")
iran <- iran[1:57,]
print(hcboxplot(x = indicator$value, var = indicator$variable,
name = "Length", color = "#2980b9",outliers = F) %>%
hc_chart(type = "column") %>%
hc_add_series(name = "Iran" , iran, type = "line", hcaes(x= year, y = indicator),color = "Red") %>%
hc_yAxis(title =list(text =codesandNamesedu$Indicator.Name[i])))
#print(hchart(indicator, "line", hcaes(x = variable, y = value, group = Country.Name)))
}
alleducationalIndicators <- wdiData %>% filter(Indicator.Code %in% importantEducationalIndicator) %>% select(Country.Name,Indicator.Name, starts_with("X"))
alleducationalIndicators <- alleducationalIndicators[941:5280,] %>% melt
alleducationalIndicators <- alleducationalIndicators %>% select(Country.Name, Indicator.Name, variable, value)
colnames(alleducationalIndicators) <- c("Country", "Indicator" ,"Year","Value")
alleducationalIndicators <- reshape(alleducationalIndicators,idvar = c("Country","Year"), timevar = "Indicator", direction = "wide")
alleducationalIndicatorsPerCnt <- alleducationalIndicators %>% group_by(Country) %>% summarise_all(funs(mean(., na.rm = TRUE)))
for(i in 3:22){
alleducationalIndicatorsPerCnt[is.na(alleducationalIndicatorsPerCnt[,i]), i] <- colMeans(alleducationalIndicatorsPerCnt[,3:22],na.rm = TRUE)[i-2]
}
scalededu <- scale(alleducationalIndicatorsPerCnt[,3:22])
kcledu = kmeans(scalededu,centers = 3)
fviz_cluster(kcledu, data = scaled)
pcaedu <- prcomp(scalededu)
fviz_pca_biplot(pcaedu, habillage=as.factor(kcledu$cluster))
importantHealthIndicator <- c("SH.ANM.ALLW.ZS","SH.ANM.CHLD.ZS","SH.CON.1524.MA.ZS","SH.DTH.0514", "SH.DTH.INJR.ZS", "SH.DTH.MORT", "SH.DYN.0514","SH.DYN.AIDS.ZS", "SH.FPL.SATM.ZS" ,"SH.H2O.SMDW.UR.ZS","SH.H2O.SMDW.RU.ZS","SH.MMR.DTHS","SH.SGR.CRSK.ZS", "SH.MED.NUMW.P3", "SH.MED.BEDS.ZS","SH.MLR.INCD.P3","SH.PRV.SMOK.FE", "SH.PRV.SMOK.MA","SH.STA.BASS.UR.ZS","SH.STA.BRTW.ZS")
codesandNameshealth <- wdiSeries %>% filter(Series.Code %in% importantHealthIndicator) %>% select(Series.Code, Indicator.Name)
for (i in 1:20){
indicator <- wdiData %>% filter(Indicator.Code == importantHealthIndicator[i]) %>% select(Country.Name, starts_with("X"))
indicator <- indicator[48:264,] %>% melt
iranhealth <- wdiData %>% filter(Country.Code == "IRN" & Indicator.Code == importantHealthIndicator[i]) %>% select(starts_with("X")) %>% t() %>% data.frame() %>% setDT(keep.rownames = T)
colnames(iranhealth) <- c("year","indicator")
iranhealth <- iranhealth[1:57,]
print(hcboxplot(x = indicator$value, var = indicator$variable,
outliers = F) %>%
hc_chart(type = "column") %>%
hc_add_series(name = "Iran" , iranhealth, type = "line", hcaes(x= year, y = indicator),color = "Red") %>%
hc_yAxis(title =list(text =codesandNameshealth$Indicator.Name[i])))
#print(hchart(indicator, "line", hcaes(x = variable, y = value, group = Country.Name)))
}
allhealthIndicators <- wdiData %>% filter(Indicator.Code %in% importantHealthIndicator) %>% select(Country.Name,Indicator.Name, starts_with("X"))
allhealthIndicators <- allhealthIndicators[941:5280,] %>% melt
allhealthIndicators <- allhealthIndicators %>% select(Country.Name, Indicator.Name, variable, value)
colnames(allhealthIndicators) <- c("Country", "Indicator" ,"Year","Value")
allhealthIndicators <- reshape(allhealthIndicators,idvar = c("Country","Year"), timevar = "Indicator", direction = "wide")
allhealthIndicatorsPerCnt <- allhealthIndicators %>% group_by(Country) %>% summarise_all(funs(mean(., na.rm = TRUE)))
for(i in 3:22){
allhealthIndicatorsPerCnt[is.na(allhealthIndicatorsPerCnt[,i]), i] <- colMeans(allhealthIndicatorsPerCnt[,3:22],na.rm = TRUE)[i-2]
}
scaledhealth <- scale(allhealthIndicatorsPerCnt[,3:22])
kclhealth = kmeans(scaledhealth,centers = 3)
fviz_cluster(kclhealth, data = scaled)
pcahealth <- prcomp(scaledhealth)
fviz_pca_biplot(pcahealth, habillage=as.factor(kclhealth$cluster))
allindicators <- c(importantEducationalIndicator,importantFinancialIndicator,importantHealthIndicator)
all <- wdiData %>% filter(Indicator.Code %in% allindicators)
all <- all[2821:15840,5:63]
for(i in 1:59){
all[is.na(all[,i]), i] <- colMeans(all,na.rm = TRUE)[i]
}
dist = stats::dist(all,method = "euclidean")
clus = hclust(dist,method = "complete")
plot(clus)
کشورهایی که بیشترین جمعیت در بزرگترین شهر خود دارند.
wdiData %>% filter(Indicator.Code == "EN.URB.LCTY") -> largestCityPopulation
largestCityPopulation[48:264,] %>% select(Country.Name, X2016) %>% arrange(desc(X2016)) %>% top_n(10)
## Selecting by X2016
## Country.Name X2016
## 1 Japan 38139625
## 2 India 26453827
## 3 China 24483789
## 4 Brazil 21296830
## 5 Mexico 21157173
## 6 Egypt, Arab Rep. 19127890
## 7 United States 18603963
## 8 Bangladesh 18237104
## 9 Pakistan 17121434
## 10 Argentina 15333630
در این سوال رابطهی میانگین قیمت گازوییل و آلودگی هوا در هفت سال اخیر در کشورهای مختلف جهان بررسی شده است. با توجه به نمودار رسم شده و همینطور تست کورلیشن به نظر میرسد رابطهی خطی میان این دو وجود دارد به این صورت که بالاتر رفتن قیمت گازوییل آلودگی هم کمتر شدهاست.
wdiData %>% filter(Indicator.Code == "EP.PMP.SGAS.CD" | Indicator.Code == "EN.ATM.PM25.MC.M3") -> gasolinePriceAndPollution
gasolinePriceAndPollution <- gasolinePriceAndPollution[95:528,]
GasPricemean <- rowMeans((gasolinePriceAndPollution %>% filter(Indicator.Code == "EP.PMP.SGAS.CD"))[,55:63], na.rm = T)
Pollutionmean <- rowMeans((gasolinePriceAndPollution %>% filter(Indicator.Code == "EN.ATM.PM25.MC.M3"))[,55:63], na.rm = T)
means <- cbind(Pollutionmean,GasPricemean)
means <- cbind(means,largestCityPopulation[48:264,] %>% select(Country.Name))
hchart(means, type = "scatter" ,hcaes(x = Pollutionmean,y = GasPricemean))
cor.test(means$Pollutionmean,means$GasPricemean)
##
## Pearson's product-moment correlation
##
## data: means$Pollutionmean and means$GasPricemean
## t = -5.9758, df = 174, p-value = 1.262e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5283242 -0.2819425
## sample estimates:
## cor
## -0.4126529
نمودار امید به زندگی در ایران که به طور کلی در طی این ۵۰ سال حدود ۳۰ سال زیاد شدهاست ولی در طول سالهای جنگ مقداری کاهش داشتهاست.
iran <- wdiData %>% filter(Country.Code == "IRN" & Indicator.Code == "SP.DYN.LE00.IN") %>% select(starts_with("X")) %>% t() %>% data.frame() %>% setDT(keep.rownames = T)
colnames(iran) <- c("year","life")
iran <- iran[1:57,]
hcboxplot(x = lifeexp$value, var = lifeexp$variable,
name = "Length", color = "#2980b9") %>%
hc_chart(type = "column") %>%
hc_add_series(iran, "line", hcaes(x= year, y = life))